!! Copyright (C) 2009,2010,2011,2012  Marco Restelli
!!
!! This file is part of:
!!   FEMilaro -- Finite Element Method toolkit
!!
!! FEMilaro is free software; you can redistribute it and/or modify it
!! under the terms of the GNU General Public License as published by
!! the Free Software Foundation; either version 3 of the License, or
!! (at your option) any later version.
!!
!! FEMilaro is distributed in the hope that it will be useful, but
!! WITHOUT ANY WARRANTY; without even the implied warranty of
!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
!! General Public License for more details.
!!
!! You should have received a copy of the GNU General Public License
!! along with FEMilaro; If not, see <http://www.gnu.org/licenses/>.
!!
!! author: Marco Restelli                   <marco.restelli@gmail.com>

!>\brief
!! This module contains the representation of the Stokes or
!! Navier-Stokes matrix.
!!
!! \n
!!
!! There are three main points concerning the representation of the
!! system matrix:
!! <ul>
!!  <li> the ordering of the unknowns;
!!  <li> the fact that different components of the matrix can be
!!  computed at different times, typically because some of them are
!!  constant (the diffusion part) while others are not (the advection
!!  part);
!!  <li> the ordering of the matrix entries in the \c t_tri
!!  representation of the system matrix (this last point affects only
!!  this module, since the details of the matrix representation are
!!  private).
!! </ul>
!!
!! Concerning the ordering of the unknowns, we use:
!!  \f{displaymath}{
!!   \left[ \begin{array}{c}
!!    u^1_{x_1} \\[2mm]
!!    u^1_{x_2} \\[2mm]
!!    \vdots \\[2mm]
!!    u^1_{x_d} \\[2mm]
!!    \vdots \\[2mm]
!!    u^{N_u}_{x_1} \\[2mm]
!!    u^{N_u}_{x_2} \\[2mm]
!!    \vdots \\[2mm]
!!    u^{N_u}_{x_d} \\[2mm]
!!    p^1 \\[2mm]
!!    \vdots \\[2mm]
!!    p^{N_p} \\[2mm]
!!    \lambda
!!   \end{array}\right]
!!  \f}
!! where subscripts indicate velocity components, superscripts
!! indicate the degrees of freedom of the finite element space,
!! \f$N_u\f$ is the number of (scalar) degrees of freedom of the
!! velocity FE space and \f$N_p\f$ is the number of pressure degrees
!! of freedom. The Lagrange multiplier \f$\lambda\f$ is present only
!! if <tt>div_lag.eqv..true.</tt>. The unknown vector
!! includes all the degrees of freedom, i.e. it also includes
!! statically condensed and Dirichlet degrees of freedom. The linear
!! system, however, is built taking into account the static
!! condensation, so that in general the size of the system matrix does
!! not match the size of the unknown vector, due to the condensed
!! degrees of freedom. The system matrix includes the Dirichlet
!! degrees of freedom, and index vectors are provided to partition it
!! accordingly (see the details in \c ns_partmat). Denoting the
!! matrices of the problem before static condensation by \f$A\f$
!! (velocity matrix, including both viscosity and advection), \f$B\f$
!! (divergence matrix), \f$C\f$ (pressure stabilization matrix) and
!! \f$\Lambda\f$ (incompressibility Lagrange multiplier), and using
!! the two indexes 1 and 2 to collectively identify the velocity
!! degrees of freedom kept and eliminated in the static condensation,
!! respectively, the final linear system reads:
!! \f{displaymath}{
!!  \left[ \begin{array}{ccc}
!!   A_{11}-A_{12}A_{22}^{-1}A_{21} & B^T_1-A_{12}A_{22}^{-1}B_2^T
!!   & 0 \\[2mm]
!!   B_1-B_2A_{22}^{-1}A_{21} & C-B_2A_{22}^{-1}B^T_2
!!   & \Lambda^T \\[2mm]
!!   0 & \Lambda & 0
!!  \end{array}\right]
!!  \left[ \begin{array}{c}
!!   u_1 \\[2mm]
!!    p \\[2mm]
!!   \lambda
!!  \end{array}\right] = 
!!  \left[ \begin{array}{c}
!!   f_1 - A_{12}A_{22}^{-1}f_2 \\[2mm]
!!   -B_2A_{22}^{-1}f_2 \\[2mm]
!!    0
!!  \end{array}\right].
!! \f}
!! The eliminated degrees of freedom can then be recovered as
!! \f{displaymath}{
!!  u_2 = A_{22}^{-1}\left(f_2 - B_2^Tp - A_{21}u_1\right).
!! \f}
!! In general, the static condensation is performed for the velocity
!! bubble degrees of freedom. However, side based velocity
!! stabilizations result in additional coupling for such degrees of
!! freedom, which then can not be eliminated any more on a <em>per
!! element</em> basis. For this reason, whenever using a side based
!! velocity stabilization, we avoid any static condensation even if
!! the velocity finite element space includes bubble degrees of
!! freedom.
!!
!! \note To ensure a consistent treatment of the static condensation,
!! the whole module relies on the value of the variable \c stat_cnd,
!! so that each function can eliminate or not the bubble velocity
!! degrees of freedom independently from the velocity FE space.
!!
!! The fact that various matrix components could need updates at
!! different times is handled by recomputing completely the whole
!! matrix whenever there is a component which needs an update. This is
!! not optimal in terms of efficiency, however it avoids the
!! complications of a selective update. Such a selective update would
!! be straightforward in the absence of static condensation, since it
!! would be accomplished by defining three different matrices:
!! <ul>
!!  <li> stokes matrix: includes all the constant terms, which are:
!!  diffusion, pressure gradient and divergence, pressure Lagrange
!!  multiplier
!!  <li> pressure stabilization matrix
!!  <li> advection matrix.
!! </ul>
!! The complete matrix can be built as a sum of the various
!! contributions, and the summation can be carried out efficiently in
!! \c t_col form using correspondent "skeleton" index vectors (see the
!! procedure \c tri2col_skeleton in \c mod_sparse). However, if there
!! is static condensation, the local inverse \f$A_{22}^{-1}\f$ must be
!! computed taking into account both the constant and the not constant
!! contributions, so that it cannot be precomputed.
!! \bug Add a more detailed treatment of the various terms, so that in
!! the case of no static condensation only the matrix components
!! needing update are recomputed.
!!
!! The ordering of the matrix entries in the \c t_tri representation
!! is as follows. The outer ordering is given by element terms
!! followed by side terms. Each group of matrix entries is then 
!! ordered as specified in a set of fields of a \c t_ls object. Such
!! order is set by \c ns_partmat, and the remaining functions don't
!! need to know the details of this ordering, since they always refer
!! to the indexes stored in the \c t_ls object.
!! \note The side based stabilizations (both for pressure and for
!! velocity) result in additional couplings, meaning that some
!! elements of the global matrix which would be zero in absence of the
!! side terms are now nonzero. Hence, the side based stabilizations
!! affect the sparsity pattern of the global matrix, and thus require
!! additional local matrices which can not be included in the local
!! element matrices.
!!
!! \par
!!
!! \note Concerning upwinding and velocity stabilizations, we have
!! that:
!! <ul>
!!  <li> element based stabilizations are not considered here, since
!!  they are dealt with at the element level in \c mod_cg_ns_locmat.
!!  <li> side based stabilization have to be considered in this module
!!  for the following reasons:
!!  <ul>
!!   <li> they prevent static condensation of velocity degrees of
!!   freedom
!!   <li> they require an additional side loop in the construction of
!!   the global system.
!!  </ul>
!! </ul>
!!
!! The data defining the size of the global linear system are
!! collected in a dedicated type, which is set by \c ns_partmat and
!! used by the other module functions.
!!
!! \note For the pressure stabilization, there are currently two
!! options:
!! <ul>
!!  <li> <tt>p_stab_type.eq.'p-grad'</tt> is a weakly consistent,
!!  element based stabilization
!!  <li> <tt>p_stab_type.eq.'pi-p-grad'</tt> is a strongly consistent,
!!  side based stabilization.
!! </ul>
!!
!! \section ddc_linsys Domain decomposition
!!
!! The implication of domain decomposition of the present module are
!! relatively small, however some points must be taken into account.
!! When there are no side based stabilizations, the sole implication
!! of the domain decomposition is that one needs to determine the size
!! of the global system and the maps from local degrees of freedom to
!! the global ones. When introducing side based stabilizations,
!! however, there is an additional coupling between degrees of freedom
!! belonging to different subdomains, so that the ghost degrees of
!! freedom must be taken into account (for the same reason which makes
!! it impossible expressing the side contributions in terms of the
!! local element matrices). This affects the following variables:
!! <ul>
!!  <li> ghost dofs are introduced in \c dofs_nat, \c dofs_dir, \c
!!  uuu, \c uuu1, \c uuu2 and so on, with the following order:
!!  \f{displaymath}{
!!   \left[ u_{nsc} \,,\, u_{sc} \,,\, u_g \,,\, p \,,\, p_g \,,\,
!!   \lambda \right]
!!  \f}
!!  where the subscripts stand for "no static condensation", "static
!!  condensation" and "ghost"; we assume that when \f$u_g\f$ is not
!!  empty, then \f$u_{sc}\f$ is empty; the same ordering is followed
!!  for local and global dofs
!!  <li> the \c t_ls object includes the ghost dofs in the
!!  specification of the system size
!!  <li> the system matrix includes the ghost dofs
!! </ul>
!<----------------------------------------------------------------------
module mod_cg_ns_linsystem

!-----------------------------------------------------------------------

 use mod_kinds, only: &
   mod_kinds_initialized, &
   wp

 use mod_messages, only: &
   mod_messages_initialized, &
   error,   &
   warning, &
   info

 !$ use omp_lib

 !$ use mod_omp_utils, only: &
 !$   mod_omp_utils_initialized, &
 !$   detailed_timing_omp, &
 !$   omput_push_key,    &
 !$   omput_pop_key,     &
 !$   omput_start_timer, &
 !$   omput_close_timer, &
 !$   omput_write_time

 use mod_linal, only: &
   mod_linal_initialized, &
   invmat, invmat_chol

 use mod_sparse, only: &
   mod_sparse_initialized, &
   t_intar,     &
   t_col,       &
   t_tri,       &
   t_rp,        &
   t_pm_sk,     &
   new_tri,     &
   tri2col_skeleton_part, &
   operator(*), &
   clear

 !$ use mod_output_control, only: &
 !$   mod_output_control_initialized, &
 !$   elapsed_format

 use mod_master_el, only: &
   mod_master_el_initialized, &
   t_lagnodes

 use mod_grid, only: &
   mod_grid_constructor, &
   mod_grid_destructor,  &
   mod_grid_initialized, &
   t_grid, t_ddc_grid

 use mod_bcs, only: &
   mod_bcs_initialized, &
   t_bcs

 use mod_cgdofs, only: &
   mod_cgdofs_initialized, &
   t_cgdofs, t_ddc_cgdofs, &
   t_ghost_dof

 use mod_base, only: &
   mod_base_initialized, &
   t_base

 use mod_testcases, only: &
   mod_testcases_initialized, &
   i_coeff_dir

 use mod_cg_ns_locmat, only: &
   mod_cg_ns_locmat_constructor, &
   mod_cg_ns_locmat_destructor,  &
   cg_ns_locmat,     &
   cg_ns_locpstab,   &
   cg_ns_locustab,   &
   cg_ns_locmassmat, &
   cg_ns_locrhs,     &
   t_bcs_error

!-----------------------------------------------------------------------
 
 implicit none

!-----------------------------------------------------------------------

! Module interface

 public :: &
   mod_cg_ns_linsystem_constructor, &
   mod_cg_ns_linsystem_destructor,  &
   mod_cg_ns_linsystem_initialized, &
   t_ls,                            &
   ns_mat, ns_partmat,       &
   ns_packsol, ns_unpacksol, &
   trim_sol, untrim_sol,     &
   ns_dirdata, ns_rhs

 private

!-----------------------------------------------------------------------

! Module types and parameters

 ! public members

 !> Linear system size
 !!
 !! Collect all the data defining the size of the linear system. The
 !! type is public, so that the variable can be defined in the main
 !! program, but the components are private, because they refer to the
 !! specific representation of the linear system.
 !<
 type t_ls
   private
   !> use static condensation
   logical :: stat_cnd
   !> number of Dirichlet dofs (must be velocity dofs)
   integer :: ndofs_dir
   !> number of scalar Dirichlet dofs (<tt>ndofs_dir/d</tt>)
   integer :: nsdofs_dir
   !> number of natural velocity dofs
   integer :: ndofs_natv
   !> number of scalar natural velocity dofs
   integer :: nsdofs_natv
   !> number of scalar natural velocity dofs excluding ghost
   integer :: nsdofs_natv_no_ghost
   !> number of natural pressure dofs (all the pressure dofs)
   integer :: ndofs_natp
   !> number of natural lambda dofs (1, if lambda is present)
   integer :: ndofs_natl
   !> total number of natural dofs (excluding static condensation)
   integer :: ndofs_nat
   !> number of local velocity dofs
   integer :: pu
   !> number of local pressure dofs
   integer :: pp
   !> number of local scalar velocity dofs after static condensation
   integer :: spus
   !> number of local velocity dofs after static condensation
   integer :: pus
   !> number of condensed local velocity dofs
   integer :: cpus
   !> total number of nonzero entries per element
   integer :: enz
   !> local indexes arrays: element terms (positions of the local
   !! velocity/pressure/lambda couplings within the per-element group
   !! of \c t_tri entries; set by \c ns_partmat)
   integer, allocatable ::      &
     euu(:,:),eup(:,:),eul(:) , &
     epu(:,:),epp(:,:),epl(:) , &
     elu(  :),elp(  :)
   !> local index: lambda-lambda entry (scalar, since there is a single
   !! lambda dof); NOTE(review): \c ns_partmat generates "no ell terms",
   !! so this appears to be kept only for completeness -- confirm
   integer :: ell
   !> total number of nonzero entries per side
   integer :: snz
   !> local indexes arrays: side terms (velocity-velocity and
   !! pressure-pressure couplings of the two elements sharing a side)
   integer, allocatable ::      &
     suu(:,:),spp(:,:)
   !> matrix dimension: velocity
   integer :: nu
   !> matrix dimension: pressure
   integer :: np
   !> matrix dimension
   integer :: n
 end type t_ls

 ! private members

! Module variables

 ! public members
 logical, protected ::               &
   mod_cg_ns_linsystem_initialized = .false.
 ! private members
 character(len=*), parameter :: &
   this_mod_name = 'mod_cg_ns_linsystem'

!-----------------------------------------------------------------------

contains

!-----------------------------------------------------------------------

 !> Module constructor: verify that all the prerequisite modules have
 !! been initialized, then set up the local matrix module
 !! \c mod_cg_ns_locmat with the given stabilization parameters.
 subroutine mod_cg_ns_linsystem_constructor(ubase,pbase,p_stab_coeff, &
                                         antisym_adv,upw,u_stab_coeff )
  type(t_base), intent(in) :: ubase, pbase !< velocity and pressure bases
  real(wp), intent(in) :: p_stab_coeff !< pressure stabilization coeff.
  logical, intent(in), optional :: antisym_adv !< antisymmetric advection
  character(len=*), intent(in), optional :: upw !< upwinding type
  real(wp), intent(in), optional :: u_stab_coeff !< velocity stab. coeff.

  logical :: prerequisites_ok
  character(len=*), parameter :: &
    this_sub_name = 'constructor'

   !Consistency checks ---------------------------
   prerequisites_ok =                    &
       mod_kinds_initialized       .and. &
       mod_linal_initialized       .and. &
       mod_sparse_initialized      .and. &
       mod_master_el_initialized   .and. &
       mod_grid_initialized        .and. &
       mod_bcs_initialized         .and. &
       mod_cgdofs_initialized      .and. &
       mod_base_initialized        .and. &
       mod_testcases_initialized
   ! the following conditions are checked only when compiling with
   ! OpenMP (conditional compilation sentinels)
   !$ prerequisites_ok = prerequisites_ok .and. &
   !$   ( (.not.detailed_timing_omp) .or. mod_omp_utils_initialized )
   !$ prerequisites_ok = prerequisites_ok .and. &
   !$   mod_output_control_initialized
   if(.not.prerequisites_ok) then
     call error(this_sub_name,this_mod_name, &
                'Not all the required modules are initialized.')
   endif
   if(mod_cg_ns_linsystem_initialized) then
     call warning(this_sub_name,this_mod_name, &
                  'Module is already initialized.')
   endif
   !----------------------------------------------

   call mod_cg_ns_locmat_constructor( ubase, pbase, p_stab_coeff, &
                                   antisym_adv, upw, u_stab_coeff )

   mod_cg_ns_linsystem_initialized = .true.
 end subroutine mod_cg_ns_linsystem_constructor

!-----------------------------------------------------------------------
 
 !> Module destructor: finalize the dependent local matrix module and
 !! reset the initialization flag.
 subroutine mod_cg_ns_linsystem_destructor()
  character(len=*), parameter :: &
    this_sub_name = 'destructor'

   !Consistency checks ---------------------------
   if(.not.mod_cg_ns_linsystem_initialized) &
     call error(this_sub_name,this_mod_name, &
                'This module is not initialized.')
   !----------------------------------------------

   call mod_cg_ns_locmat_destructor()

   mod_cg_ns_linsystem_initialized = .false.
 end subroutine mod_cg_ns_linsystem_destructor

!-----------------------------------------------------------------------
 
 !> Partition of the system matrix according to the Dirichlet bcs.
 !!
 !! The system matrix \f$M\f$ is partitioned into four submatrices as
 !! \f{displaymath}{
 !!  M = \left[\begin{array}{cc}
 !!    M_{11} & M_{12} \\\
 !!    M_{21} & M_{22}
 !!  \end{array}\right]
 !! \f}
 !! where index 1 corresponds to unknown degrees of freedom and index
 !! 2 corresponds to Dirichlet data. Various arrays and variables
 !! related with the linear system are also allocated or defined:
 !! <ul>
 !!  <li> natural and Dirichlet indexes: \c dofs_nat, \c dofs_dir
 !!  <li> unknown and rhs vectors: \c uuu, \c uuu1, \c uuu2, \c
 !!  uuu1_old, \c fff, \c fff1, \c fff2, \c rhs1
 !!  <li> static condensation: \c stat_cnd.
 !! </ul>
 !!
 !! \note This subroutine ignores the nodes eliminated in the static
 !! condensation (i.e. we consider here the dofs which appear in the
 !! final linear system). An exception is \c uuu, which is allocated
 !! to a size which includes the statically condensed degrees of
 !! freedom.
 !!
 !! \note \c uuu includes also the statically condensed degrees of
 !! freedom, so that <tt> size(uuu) .ne. size(uuu1)+size(uuu2)</tt>.
 !! For this reason, one should never copy \c uuu1 and \c uuu2 into \c
 !! uuu directly, but should always use \c ns_packsol.
 !!
 !! \note The variable \c mmmt holds the transposed system matrix.
 !<
 ! NOTE: the next two lines implement a conditionally "pure" attribute.
 ! Without OpenMP the "pure &" continuation attaches directly to the
 ! ns_partmat statement (so ns_partmat is pure); with OpenMP the "!$"
 ! sentinel line becomes active code, "pure" attaches to the dummy
 ! subroutine omp_dummy_2, and ns_partmat is compiled as a normal
 ! (impure) subroutine.
 pure &
 !$ subroutine omp_dummy_2(); end subroutine omp_dummy_2
 subroutine ns_partmat(mmmt,dofs_nat,dofs_dir,uuu,uuu1,uuu2,          &
      fff,fff1,fff2,rhs1,ls,uref_nodes,ubase,pbase,grid,udofs,pdofs,  &
  div_lag,u_stab,imp_adv,p_stab,p_stab_type,err , ddc_grid,ddc_udofs, &
      ddc_pdofs,gn,gdofs_nat)
  type(t_pm_sk), target, intent(out) :: mmmt !< transposed system matrix
  integer,  allocatable, intent(out) :: dofs_nat(:), dofs_dir(:)
  real(wp), allocatable, intent(out) :: uuu(:), uuu1(:), uuu2(:), &
        fff(:), fff1(:), fff2(:), rhs1(:)
  type(t_ls), intent(out) :: ls !< system structure
  type(t_lagnodes), intent(in) :: uref_nodes(:) !< reference nodes
  type(t_base), intent(in) :: ubase, pbase !< FE bases
  type(t_grid), target, intent(in) :: grid !< grid
  type(t_cgdofs), intent(in) :: udofs, pdofs !< velocity/pressure dofs
  logical, intent(in) :: div_lag !< use the pressure Lagrange multiplier
  logical, intent(in) :: u_stab !< side based velocity stabilization
  logical, intent(in) :: imp_adv !< implicit advection
  logical, intent(in) :: p_stab !< pressure stabilization
  character(len=*), intent(in) :: p_stab_type !< type of p stab.
  type(t_bcs_error), intent(out) :: err !< error messages
  !> ddc variables
  type(t_ddc_grid), target, intent(in) :: ddc_grid !< ddc grid
  type(t_ddc_cgdofs), intent(in) :: ddc_udofs, ddc_pdofs
  integer, intent(out) :: gn !< global system size
  !> local to global map for the natural degrees of freedom in case of
  !! domain decomposition
  !<
  integer, allocatable, intent(out) :: gdofs_nat(:)

  logical :: u_u_stab, eb_p_stab, sb_p_stab
  ! indexes
  integer :: ie, is, i, id, ik, ii, iii, j, jd, jk, jj, jjj, &
    idx(udofs%d), pos, s_e, s_s
  ! mmmi/mmmj: unrolled (row,column) index lists for the t_tri skeleton
  integer, allocatable :: mmmi(:), mmmj(:), udofs_el(:,:), pdofs_el(:,:)
  ! working arrays and variables
  type(t_tri) :: t
  type(t_intar) :: idij(2)
  type(t_ghost_dof) :: gd
 
   !--------------------------------------------------------------------
   ! Check whether we perform the static condensation
   ! - the static condensation is done if there are bubble type
   ! velocity degrees of freedom
   ! - side based velocity stabilization prevents static condensation
   ! (the side u stab. terms enter the matrix only with implicit
   ! advection, hence the .and.imp_adv)
   u_u_stab = u_stab.and.imp_adv
   ls%stat_cnd = ( uref_nodes(grid%d)%nn.gt.0 ).and.( .not.u_u_stab )
   !--------------------------------------------------------------------

   !--------------------------------------------------------------------
   ! Check what to do with the pressure stabilization
   eb_p_stab = p_stab.and.( trim(p_stab_type).eq.'p-grad' ) ! el. based
   sb_p_stab = p_stab.and.( trim(p_stab_type).eq.'pi-p-grad' ) ! side b.
   if(p_stab.and.((.not.eb_p_stab).and.(.not.sb_p_stab))) then
     err%lerr = .true.
err%message = 'Unknown pressure stabilization "'//trim(p_stab_type)//'"'
     return
   else
     err%lerr = .false.
   endif
   !--------------------------------------------------------------------

   !--------------------------------------------------------------------
   ! Count the natural and Dirichlet dofs 
   ! - natural dofs can be velocity, pressure or lambda, Dirichlet
   ! dofs are velocity only;
   ! - we only count here dofs which are not eliminated in the static
   ! condensation
   ls%nsdofs_dir  = size(udofs%dir_dofs,2)
   if(u_u_stab) ls%nsdofs_dir = ls%nsdofs_dir+ddc_udofs%n_ghost_dir_dofs
   ls%ndofs_dir   = udofs%d*ls%nsdofs_dir
   ls%nsdofs_natv = size(udofs%nat_dofs)
   if(ls%stat_cnd) & ! we assume that u_u_stab implies no stat_cnd
     ls%nsdofs_natv = ls%nsdofs_natv - udofs%ndofsd(udofs%d)
   ls%nsdofs_natv_no_ghost = ls%nsdofs_natv
   if(u_u_stab) ls%nsdofs_natv=ls%nsdofs_natv+ddc_udofs%n_ghost_nat_dofs
   ls%ndofs_natv  = udofs%d*ls%nsdofs_natv
   ls%ndofs_natp  =      pdofs%ndofs
   if(sb_p_stab) ls%ndofs_natp = ls%ndofs_natp  +ddc_pdofs%n_ghost_dofs
   if(div_lag) then
     ls%ndofs_natl = 1
   else
     ls%ndofs_natl = 0
   endif
   ls%ndofs_nat = ls%ndofs_natv+ls%ndofs_natp+ls%ndofs_natl
   allocate(  dofs_nat(ls%ndofs_nat) , dofs_dir(ls%ndofs_dir) )
   allocate( gdofs_nat(ls%ndofs_nat) )
   !--------------------------------------------------------------------

   !--------------------------------------------------------------------
   ! Set the dofs indices
   ! Each scalar dof expands into d consecutive vector entries, hence
   ! the (dof-1)*d + idx pattern used throughout this section.
   idx = (/ (i, i=1,udofs%d) /) - 1 ! dofs indexes start from 0
   do i=1,size(udofs%dir_dofs,2) ! exclude ghost dofs
     is = (i-1)*udofs%d+1
     ie =    i *udofs%d
     dofs_dir(is:ie) = (udofs%dir_dofs(1,i)-1)*udofs%d + idx
   enddo
   do i=1,ls%nsdofs_natv_no_ghost ! velocity (without ghost dofs)
     is = (i-1)*udofs%d+1
     ie =    i *udofs%d
     dofs_nat(is:ie) = (udofs%nat_dofs(  i)-1)*udofs%d + idx
    gdofs_nat(is:ie) = (ddc_udofs%gnat_dofs(i)-1)*udofs%d + idx
   enddo
   if(u_u_stab) then
     do i=1,ddc_udofs%n_ghost_dofs ! add the velocity ghost dofs
       ii = ddc_udofs%added_ghost_dofs(i,1)
       jj = ddc_udofs%added_ghost_dofs(i,2)
       gd = ddc_udofs%ghost_dofs(ii,jj)
       is = (gd%i_dir_nat-1)*udofs%d+1
       ie =    gd%i_dir_nat *udofs%d
       if(gd%dir) then
         dofs_dir(is:ie) = (gd%i-1)*udofs%d + idx
       else
         dofs_nat(is:ie) = (gd%i-1)*udofs%d + idx
        gdofs_nat(is:ie) = (gd%gi_dir_nat-1)*udofs%d + idx
       endif
     enddo
   endif
   ! pressure dofs follow all the velocity dofs; with static
   ! condensation the global indexes are shifted back by the number of
   ! eliminated (bubble) velocity dofs
   do i=1,pdofs%ndofs ! pressure
     dofs_nat(ls%ndofs_natv+i) = ls%ndofs_dir+ls%ndofs_natv + i-1
    gdofs_nat(ls%ndofs_natv+i) = udofs%d*ddc_udofs%ngnat_dofs    &
                                            + ddc_pdofs%gdofs(i)%gi-1
     if(ls%stat_cnd) gdofs_nat(ls%ndofs_natv+i) = &
         gdofs_nat(ls%ndofs_natv+i) - grid%d*ddc_udofs%ngdofsd(udofs%d)
   enddo
   if(sb_p_stab) then
     do i=1,ddc_pdofs%n_ghost_dofs ! add the pressure ghost dofs
       ii = ddc_pdofs%added_ghost_dofs(i,1)
       jj = ddc_pdofs%added_ghost_dofs(i,2)
       gd = ddc_pdofs%ghost_dofs(ii,jj)
       dofs_nat(ls%ndofs_natv+gd%i) = ls%ndofs_dir+ls%ndofs_natv+gd%i-1
      gdofs_nat(ls%ndofs_natv+gd%i) = udofs%d*ddc_udofs%ngnat_dofs    &
                                            + gd%gi-1
       if(ls%stat_cnd) gdofs_nat(ls%ndofs_natv+gd%i) = &
         gdofs_nat(ls%ndofs_natv+gd%i)-grid%d*ddc_udofs%ngdofsd(udofs%d)
     enddo
   endif
   if(div_lag) then ! the lambda dof is the last one
     dofs_nat(ls%ndofs_nat) = ls%ndofs_dir+ls%ndofs_nat - 1
    gdofs_nat(ls%ndofs_nat) = udofs%d*ddc_udofs%ngnat_dofs + &
                                      ddc_pdofs%ngdofs
     if(ls%stat_cnd) gdofs_nat(ls%ndofs_nat) = &
         gdofs_nat(ls%ndofs_nat) - grid%d*ddc_udofs%ngdofsd(udofs%d)
   endif
   !--------------------------------------------------------------------

   !--------------------------------------------------------------------
   ! Build the partitioned matrix

   ! 1) compute the local and global matrix sizes
   ! local matrices
   ls%pu = grid%d * ubase%pk
   ls%pp =          pbase%pk
   ! corrected values to consider the static condensation: *s
   ls%spus = ubase%pk
   if(ls%stat_cnd) ls%spus = ls%spus - uref_nodes(grid%d)%nn
   ls%pus  = grid%d*ls%spus
   ls%cpus = ls%pu-ls%pus

   ! global matrix
   ls%nu = ls%ndofs_dir + ls%ndofs_natv
   ls%np =                ls%ndofs_natp
   ls%n = ls%nu + ls%np + ls%ndofs_natl
   gn = grid%d*ddc_udofs%ngnat_dofs + ddc_pdofs%ngdofs + ls%ndofs_natl
   if(ls%stat_cnd) gn = gn - grid%d*ddc_udofs%ngdofsd(udofs%d)

   ! precompute the local matrix indexes
   allocate( &
     ls%euu(ls%pus,ls%pus),ls%eup(ls%pus ,ls%pp ),ls%eul(ls%pus) , &
     ls%epu(ls%pp ,ls%pus),ls%epp(ls%pp  ,ls%pp ),ls%epl(ls%pp)  , &
     ls%elu(       ls%pus),ls%elp(        ls%pp )                  )
   ! side arrays couple the dofs of the two elements sharing a side,
   ! hence the factor 2
   allocate( ls%suu(2*ls%pus,2*ls%pus) , &
             ls%spp(2*ls%pp ,2*ls%pp) )

   ! nonzero entries per element
   ! Each local index array stores the position of the corresponding
   ! coupling within the per-element group of t_tri entries; ls%enz
   ! counts the entries as they are assigned.
   ls%enz = 0

   ! velocity columns
   do j=1,ls%spus; do jd=1,grid%d
     jj = jd + (j-1)*grid%d
     do i=1,ls%spus; do id=1,grid%d
       ii = id + (i-1)*grid%d
       ls%enz = ls%enz + 1
       ls%euu(ii,jj) = ls%enz
     enddo;enddo
     do i=1,ls%pp
       ls%enz = ls%enz + 1
       ls%epu(i ,jj) = ls%enz
     enddo
     ! no elu terms
   enddo;enddo
   ! pressure columns
   do j=1,ls%pp
     do i=1,ls%spus; do id=1,grid%d
       ii = id + (i-1)*grid%d
       ls%enz = ls%enz + 1
       ls%eup(ii,j ) = ls%enz
     enddo;enddo
     if(ls%stat_cnd.or.eb_p_stab) then
       do i=1,ls%pp
         ls%enz = ls%enz + 1
         ls%epp(i,j) = ls%enz
       enddo
     endif
     if(div_lag) then
       ls%enz = ls%enz + 1
       ls%elp(   j ) = ls%enz
     endif
   enddo
   ! lambda column
   if(div_lag) then
     jj = 1 + ls%pp + ls%pus
     ! no eul terms
     do i=1,ls%pp
       ls%enz = ls%enz + 1
       ls%epl(i    ) = ls%enz
     enddo
     ! no ell terms
   endif

   ! nonzero entries per side
   ls%snz = 0

   ! velocity terms
   if(u_u_stab) then
     ! We exploit here the fact that there is no coupling in the
     ! velocity stabilization of the d velocity components,
     ! and thus we include the sole "diagonal" terms.
     do jk=1,2; do j=1,ls%spus
       jj = ((j-1) + (jk-1)*ls%spus)*grid%d
       do ik=1,2; do i=1,ls%spus
         ii = ((i-1) + (ik-1)*ls%spus)*grid%d
         do id=1,grid%d
           ls%snz = ls%snz + 1
           ls%suu(ii+id,jj+id) = ls%snz
         enddo
       enddo;enddo
     enddo;enddo
   endif
   ! pressure terms
   if(sb_p_stab) then
     do jk=1,2; do j=1,ls%pp
       jj = j + (jk-1)*ls%pp
       do ik=1,2; do i=1,ls%pp
         ii = i + (ik-1)*ls%pp
         ls%snz = ls%snz + 1
         ls%spp(ii,jj) = ls%snz
       enddo; enddo
     enddo; enddo
   endif

   ! size of the unrolled matrices: element, side and ddc side terms
   ii = grid%ne*ls%enz + (grid%ni+ddc_grid%nns)*ls%snz
   allocate( mmmi( ii ) , mmmj( ii ) )

   ! 2) element loop to fill mmmi and mmmj
   elem_loop: do ie=1,grid%ne

     s_e = (ie-1)*ls%enz ! element shift

     ! velocity columns
     do j=1,ls%spus; do jd=1,grid%d
       jj = jd + (j-1)*grid%d
       jjj = (udofs%dofs(j,ie)-1)*grid%d + jd

       ! velocity rows
       do i=1,ls%spus; do id=1,grid%d
         ii = id + (i-1)*grid%d
         iii = (udofs%dofs(i,ie)-1)*grid%d + id

         pos = ls%euu(ii,jj) + s_e
         mmmi(pos) = iii
         mmmj(pos) = jjj
       enddo; enddo

       ! pressure rows (and symmetric terms)
       do i=1,ls%pp
         iii = pdofs%dofs(i,ie) + ls%nu

         pos = ls%epu(i,jj) + s_e
         mmmi(pos) = iii
         mmmj(pos) = jjj
         pos = ls%eup(jj,i) + s_e
         mmmi(pos) = jjj
         mmmj(pos) = iii
       enddo

       ! lambda row: no elu nor eul terms

     enddo; enddo

     ! pressure columns
     do j=1,ls%pp
       jjj = pdofs%dofs(j,ie) + ls%nu

       ! velocity rows: done already

       ! pressure rows
       if(ls%stat_cnd.or.eb_p_stab) then
         do i=1,ls%pp
           iii = pdofs%dofs(i,ie) + ls%nu

           pos = ls%epp(i,j) + s_e
           mmmi(pos) = iii
           mmmj(pos) = jjj
         enddo
       endif

       ! lambda row (and symmetric terms)
       if(div_lag) then
         iii = 1 + ls%np + ls%nu

         pos = ls%elp(j) + s_e
         mmmi(pos) = iii
         mmmj(pos) = jjj
         pos = ls%epl(j) + s_e
         mmmi(pos) = jjj
         mmmj(pos) = iii
       endif

     enddo

     ! lambda column : no ell terms

   enddo elem_loop

   ! 3) side loop to fill mmmi and mmmj
   side_loop: do is=1,grid%ni

     s_s = (is-1)*ls%snz + grid%ne*ls%enz ! side shift

     ! velocity velocity terms
     if(u_u_stab) then
       do jk=1,2; do j=1,ls%spus
         jj = ((j-1) + (jk-1)*ls%spus)*grid%d
         do ik=1,2; do i=1,ls%spus
           ii = ((i-1) + (ik-1)*ls%spus)*grid%d
           do id=1,grid%d
             jjj = (udofs%dofs(j,grid%s(is)%ie(jk))-1)*grid%d + id
             iii = (udofs%dofs(i,grid%s(is)%ie(ik))-1)*grid%d + id
             pos = ls%suu(ii+id,jj+id) + s_s
             mmmi(pos) = iii
             mmmj(pos) = jjj
           enddo
         enddo;enddo
       enddo;enddo
     endif
     ! pressure pressure terms
     if(sb_p_stab) then
       do jk=1,2; do j=1,ls%pp
         jj = j + (jk-1)*ls%pp
         jjj = pdofs%dofs(j,grid%s(is)%ie(jk)) + ls%nu
         do ik=1,2; do i=1,ls%pp
           ii = i + (ik-1)*ls%pp
           iii = pdofs%dofs(i,grid%s(is)%ie(ik)) + ls%nu

           pos = ls%spp(ii,jj) + s_s
           mmmi(pos) = iii
           mmmj(pos) = jjj
         enddo; enddo
       enddo; enddo
     endif
   enddo side_loop

   ! 4) ddc side loop to fill mmmi and mmmj
   allocate( udofs_el(ls%spus,2) , pdofs_el(ls%pp,2) )
   ddc_side_loop: do is=1,ddc_grid%nns

     s_s = (is-1)*ls%snz + grid%ni*ls%snz + grid%ne*ls%enz ! side shift

     ! velocity velocity terms
     if(u_u_stab) then
       ! collect the "local" element dofs: for the first element, such
       ! dofs really exist, while for the second element, these are
       ! ghost dofs
       do i=1,ls%spus
         udofs_el(i,1) = udofs%dofs(i,grid%s(ddc_grid%ns(is)%i)%ie(1))
         udofs_el(i,2) = ddc_udofs%ghost_dofs(i,is)%i
       enddo
       do jk=1,2; do j=1,ls%spus
         jj = ((j-1) + (jk-1)*ls%spus)*grid%d
         do ik=1,2; do i=1,ls%spus
           ii = ((i-1) + (ik-1)*ls%spus)*grid%d
           do id=1,grid%d
             jjj = (udofs_el(j,jk)-1)*grid%d + id
             iii = (udofs_el(i,ik)-1)*grid%d + id
             pos = ls%suu(ii+id,jj+id) + s_s
             mmmi(pos) = iii
             mmmj(pos) = jjj
           enddo
         enddo;enddo
       enddo;enddo
     endif
     ! pressure pressure terms
     if(sb_p_stab) then
       do i=1,ls%pp
         pdofs_el(i,1) = pdofs%dofs(i,grid%s(ddc_grid%ns(is)%i)%ie(1))
         pdofs_el(i,2) = ddc_pdofs%ghost_dofs(i,is)%i
       enddo
       do jk=1,2; do j=1,ls%pp
         jj = j + (jk-1)*ls%pp
         jjj = pdofs_el(j,jk) + ls%nu
         do ik=1,2; do i=1,ls%pp
           ii = i + (ik-1)*ls%pp
           iii = pdofs_el(i,ik) + ls%nu

           pos = ls%spp(ii,jj) + s_s
           mmmi(pos) = iii
           mmmj(pos) = jjj
         enddo; enddo
       enddo; enddo
     endif

   enddo ddc_side_loop
   deallocate( udofs_el , pdofs_el )

   ! 5) build the matrix skeleton
   ! mmmj and mmmi are deliberately swapped in new_tri so that mmmt
   ! holds the TRANSPOSED system matrix (see the header note)
   mmmi = mmmi-1; mmmj = mmmj-1 ! indexes start from 0
   t = new_tri(ls%n,ls%n,mmmj,mmmi,0.0_wp) ! notice transposition
   allocate( idij(1)%i(ls%ndofs_nat) ); idij(1)%i = dofs_nat
   allocate( idij(2)%i(ls%ndofs_dir) ); idij(2)%i = dofs_dir
   call tri2col_skeleton_part(mmmt,idij,idij,t)
   !--------------------------------------------------------------------

   !--------------------------------------------------------------------
   ! Deallocate work arrays
   deallocate(mmmi,mmmj,idij(1)%i,idij(2)%i)
   call clear(t)
   !--------------------------------------------------------------------

   !--------------------------------------------------------------------
   ! Allocate the output arrays
   ! (uuu also holds the statically condensed dofs, cpus per element,
   ! so its size differs from size(uuu1)+size(uuu2): see the header)
   allocate( uuu1(ls%ndofs_nat), &
     fff1(ls%ndofs_nat), rhs1(ls%ndofs_nat) )
   allocate( uuu2(ls%ndofs_dir), fff2(ls%ndofs_dir) )
   allocate( fff(ls%n) )
   allocate( uuu(ls%ndofs_dir+ls%ndofs_nat+ls%cpus*grid%ne) )
   !--------------------------------------------------------------------

 end subroutine ns_partmat

!-----------------------------------------------------------------------
 
 !> Compute the Navier-Stokes matrix.
 !!
 !! There is no accounting for the Dirichlet boundary conditions,
 !! except for the fact that the matrix partitioning is followed.
 !!
 !! There are two optional arguments to control the form of the matrix
 !! \c mmm.
 !! <ul>
 !!  <li> The optional argument \c uuu is the actual velocity
 !!  \f$\underline{u}\f$; if it is \c present, then the complete
 !!  nonlinear matrix is computed, including advection, otherwise only
 !!  the linear operator of the Stokes problem is computed.
 !!  <li> The optional argument \c sigma is a time step (more
 !!  precisely, an equivalent time step: see \c mod_time_integrators).
 !!  If it is \c present, it is assumed that a time evolution problem
 !!  is considered, so that the system matrix is rescaled as
 !!  \f{displaymath}{
 !!   M = \left[\begin{array}{ccc}
 !!     \sigma A+M & \sigma B^T&0\\\
 !!     \sigma B & \sigma C & \sigma \Lambda^T \\\
 !!      0 & \sigma \Lambda & 0
 !!   \end{array}\right].
 !!  \f}
 !!  where \f$M\f$ is the mass matrix. If \c uuu is also \c present,
 !!  then \c bbb must be also present and the right hand side is
 !!  modified as
 !!  \f{displaymath}{
 !!   \underline{f}_{\sigma}= \sigma \underline{f}+M
 !!   \underline{b}.
 !!  \f}
 !!  The right hand side is not modified for linear problems because
 !!  in such a case one would typically compute the Stokes matrix at
 !!  the beginning of the integration and then compute the right hand
 !!  side independently at each time level including both advection
 !!  and the mass term.
 !! </ul>
 !! \note To be able to perform the static condensation, all the
 !! matrix contributions must be handled together, and it's not
 !! possible to assemble separate matrices for separate operators
 !! (diffusion, advection, mass).
 !!
 !! All the four combinations of optional arguments are admissible:
 !! <ul>
 !!  <li> <tt>.not.present(uuu) .and. .not.present(sigma)</tt>
 !!  \f$\mapsto\f$ linear steady state system
 !!  <li> <tt>     present(uuu) .and. .not.present(sigma)</tt>
 !!  \f$\mapsto\f$ nonlinear steady state system
 !!  <li> <tt>.not.present(uuu) .and.      present(sigma)</tt>
 !!  \f$\mapsto\f$ linear time dependent system
 !!  <li> <tt>     present(uuu) .and.      present(sigma)</tt>
 !!  \f$\mapsto\f$ nonlinear time dependent system.
 !! </ul>
 !!
 !! The optional arguments \c a22i, \c a12a22i and \c b2a22i can be
 !! used to store the corresponding matrices for time dependent linear
 !! problems, since these are matrices used in the evaluation of the
 !! right hand side. Since in the linear case \f$A\f$ is symmetric,
 !! the latter two matrices are the transposes of \c a22ia21 and \c
 !! a22ib2t, however we compute them explicitly for clarity.
 !!
 !! \note When performing the static condensation together with
 !! implicit advection, the off-diagonal terms corresponding to
 !! \f$B\f$ and \f$B^T\f$ are no longer symmetric, since they include
 !! a contribution from the velocity matrix \f$A\f$ which in turns
 !! includes the advection term. Nevertheless, the sparsity pattern of
 !! the two matrices remains symmetric.
 !<
 ! NOTE: the "pure" attribute below applies to ns_mat only in serial
 ! builds. With OpenMP enabled the "!$" sentinel line becomes active
 ! code, so "pure" attaches to the dummy subroutine omp_dummy_1 and
 ! ns_mat (which then uses the impure OpenMP runtime) is not pure.
 pure &
 !$ subroutine omp_dummy_1(); end subroutine omp_dummy_1
 subroutine ns_mat(mmm,fff,ls,a22ia21,a22ib2t,a22if2, ddc_grid , &
   ubase,pbase,grid,bcs,udofs,pdofs,div_lag,p_stab,p_stab_type , &
   u_stab,b_err,bbb,uuu,sigma,a22i,a12a22i,b2a22i)
  type(t_pm_sk), target, intent(inout) :: mmm !< partitioned system matrix
  real(wp), intent(out) :: fff(:) !< rhs
  real(wp), intent(out), allocatable :: & !< static condensation arrays
    a22ia21(:,:,:), a22ib2t(:,:,:), a22if2(:,:)
  type(t_base), intent(in) :: ubase, pbase !< FE bases
  type(t_grid), target, intent(in) :: grid !< grid
  type(t_ddc_grid), target, intent(in) :: ddc_grid !< ddc grid
  type(t_bcs), target, intent(in) :: bcs !< boundary conditions
  type(t_cgdofs), intent(in) :: udofs, pdofs !< degrees of freedom
  logical, intent(in) :: div_lag !< use the pressure Lagrange multiplier
  logical, intent(in) :: p_stab !< use the pressure stabilization
  character(len=*), intent(in) :: p_stab_type !< type of p stab.
  logical, intent(in) :: u_stab !< use the side velocity stabilization
  type(t_ls), intent(in) :: ls !< system structure
  type(t_bcs_error), intent(out) :: b_err !< boundary error
  real(wp), intent(in), optional :: bbb(:) !< right hand side
  real(wp), intent(in), optional :: uuu(:) !< present velocity
  real(wp), intent(in), optional :: sigma !< time step
  real(wp), allocatable, intent(out), optional :: &
    a22i(:,:,:), a12a22i(:,:,:), b2a22i(:,:,:)
 
  ! indexes
  logical :: u_u_stab, eb_p_stab, sb_p_stab
  integer :: ie, is, i, id, ik, ii, j, jd, jk, jj, jjj, pos
  integer :: s_e, s_s
  real(wp), allocatable :: aak(:,:), fk(:), bbk(:,:), aak_s(:,:),    &
    fk_s(:), bbk_s(:,:), bbkt_s(:,:), cck(:,:), gk_s(:), llk(:),     &
    tpsk(:,:), tusk(:,:), a22i_l(:,:), a22ita12t(:,:), bk(:), uk(:), &
    uk2(:,:), us(:,:), mmk(:,:)
  type(t_bcs_error) :: b_err_x
  ! timing
  !$ double precision :: t0_omp, t1_omp
  !$ character(len=1000) :: message
  character(len=*), parameter :: &
    this_sub_name = 'ns_mat'
 
   !--------------------------------------------------------------------
   ! Check what to do with the pressure stabilization
   eb_p_stab = p_stab.and.( trim(p_stab_type).eq.'p-grad' ) ! el. based
   sb_p_stab = p_stab.and.( trim(p_stab_type).eq.'pi-p-grad' ) ! side b.
   !--------------------------------------------------------------------
 
   b_err%lerr = .false. 
   mmm%m(1,1)%ax = 0.0_wp; mmm%m(1,2)%ax = 0.0_wp
   mmm%m(2,1)%ax = 0.0_wp; mmm%m(2,2)%ax = 0.0_wp
   fff = 0.0_wp
 
   !--------------------------------------------------------------------
   !0) determine the sizes of the matrix contributions
 
   !$ t0_omp = omp_get_wtime()
   !$ if(detailed_timing_omp) then
   !$   call omput_push_key("LocalMatrices")
   !$   call omput_start_timer()
   !$ endif
 
   ! NOTE: with default(none) every variable referenced inside the
   ! parallel region must be listed; in particular ddc_grid is needed
   ! in the shared clause because it is used in ddc_side_loop.
   !$omp parallel private(aak, fk, bbk, aak_s, fk_s, bbk_s, bbkt_s,  &
   !$omp                  cck, gk_s, llk, tpsk, tusk, a22i_l,        &
   !$omp                  a22ita12t, bk, uk, uk2, us, mmk, ie, is,   &
   !$omp                  b_err_x, s_e, s_s, i, j, id, jd, ii, jj,   &
   !$omp                  jjj, jk, ik, pos, u_u_stab) &
   !$omp           shared(ubase, pbase, grid, bcs, udofs, pdofs,     &
   !$omp                  ls, div_lag, eb_p_stab, sb_p_stab, u_stab, &
   !$omp                  a22ia21, a22ib2t, a22i, a22if2, a12a22i,   &
   !$omp                  b2a22i, mmm, fff, bbb, uuu, b_err, sigma,  &
   !$omp                  ddc_grid) &
   !$omp          default(none)
 
   ! allocations (matrices are allocated even if they are not used)
   allocate( aak(ls%pu,ls%pu) ,                    fk(ls%pu) , &
             bbk(ls%pp,ls%pu) , cck(ls%pp,ls%pp) ,             &
                                llk(      ls%pp) )
   allocate( aak_s(ls%pus,ls%pus) ,bbkt_s(ls%pp,ls%pus), fk_s(ls%pus) ,&
             bbk_s(ls%pp ,ls%pus) ,                      gk_s(ls%pp) , &
             tpsk(2*ls%pp,2*ls%pp),tusk(2*ls%spus,2*ls%spus) )
   allocate(   a22i_l(ls%cpus,ls%cpus) , a22ita12t(ls%cpus,ls%pus) )
   !$omp single
    allocate( a22ia21(ls%cpus, ls%pus,grid%ne) , &
              a22ib2t(ls%cpus, ls%pp ,grid%ne) , &
              a22if2 (ls%cpus,        grid%ne) )
    if(present(a22i)) &
      allocate( a22i(ls%cpus,ls%cpus,grid%ne) , &
             a12a22i(ls%pus ,ls%cpus,grid%ne) , &
              b2a22i(ls%pp  ,ls%cpus,grid%ne) )
   !$omp end single
   allocate( bk(ls%pu) , uk(ls%pu) , uk2(grid%d,ubase%pk) , &
             us(grid%d,ubase%ms) , mmk(ls%pu,ls%pu) )
   !--------------------------------------------------------------------
 
   !$omp do schedule(static)
   elem_loop: do ie=1,grid%ne
  
     !------------------------------------------------------------------
     !1) compute local matrices
     if(present(uuu)) then
       ! get the element velocity and pass it to the local matrix
       do id=1,grid%d
         uk(id:ls%pu:grid%d) = uuu( (udofs%dofs(:,ie)-1)*grid%d + id )
       enddo
       call cg_ns_locmat( aak , bbk , fk , ubase , pbase ,        &
                          grid%e(ie) , bcs%b_e2be(ie) , b_err_x , &
                          eb_p_stab , cck , div_lag , llk , uk )
     else
       ! linear operator
       call cg_ns_locmat( aak , bbk , fk , ubase , pbase ,        &
                          grid%e(ie) , bcs%b_e2be(ie) , b_err_x , &
                          eb_p_stab , cck , div_lag , llk )
     endif
     ! In the following, we rely on the fact that cg_ns_locmat has set
     ! cck to zero if there is no element based pressure
     ! stabilization. This simplifies accumulating the contributions
     ! (possibly) coming from the static condensation.
  
     if(b_err_x%lerr) then
       !$omp critical
       b_err = b_err_x
       !$omp end critical
     endif
     !------------------------------------------------------------------
 
     !------------------------------------------------------------------
     !1.5) Rescale the matrix for time dependent problems
     if(present(sigma)) then
       call cg_ns_locmassmat( mmk , ubase , grid%e(ie) )
       aak = sigma*aak + mmk
       bbk = sigma*bbk
       cck = sigma*cck
       llk = sigma*llk
       if(present(uuu)) then
         ! bbb must be present in this case (see the header comments)
         do id=1,grid%d
           bk(id:ls%pu:grid%d) = bbb( (udofs%dofs(:,ie)-1)*grid%d + id )
         enddo
         fk = sigma*fk + matmul(mmk,bk)
       endif
     endif
     !------------------------------------------------------------------
  
     !------------------------------------------------------------------
     !2) static condensation
     if(ls%stat_cnd) then
       ! local inversion
       if(present(uuu)) then ! the matrix is not symmetric
         call invmat     ( aak(ls%pus+1:ls%pu,ls%pus+1:ls%pu) , a22i_l )
       else ! the matrix is symmetric
         call invmat_chol( aak(ls%pus+1:ls%pu,ls%pus+1:ls%pu) , a22i_l )
       endif
       ! velocity equation
       a22ia21(:,:,ie) = matmul( a22i_l , aak(ls%pus+1:ls%pu,1:ls%pus) )
       a22if2 (:,  ie) = matmul( a22i_l , fk (ls%pus+1:ls%pu) )
       aak_s = aak(1:ls%pus,1:ls%pus) &
                - matmul(aak(1:ls%pus,ls%pus+1:ls%pu) , a22ia21(:,:,ie))
       fk_s  = fk (1:ls%pus) &
                - matmul(aak(1:ls%pus,ls%pus+1:ls%pu) , a22if2 (:,ie))
       ! pressure equation
       a22ib2t(:,:,ie) = matmul(a22i_l,transpose(bbk(:,ls%pus+1:ls%pu)))
       bbk_s = bbk(:,1:ls%pus) &
                 - matmul( bbk(:,ls%pus+1:ls%pu) , a22ia21(:,:,ie) )
       if(present(uuu)) then ! the B and B^T terms are not symmetric
         a22ita12t =                                                &
           transpose(matmul( aak(1:ls%pus,ls%pus+1:ls%pu) , a22i_l ))
         bbkt_s = bbk(:,1:ls%pus) & ! first contribution is the same
                 - matmul( bbk(:,ls%pus+1:ls%pu) , a22ita12t )
       else ! the matrix is symmetric
         bbkt_s = bbk_s
       endif
       cck  = cck - matmul( bbk(:,ls%pus+1:ls%pu) , a22ib2t(:,:,ie) )
       gk_s =     - matmul( bbk(:,ls%pus+1:ls%pu) , a22if2(:,ie) )
       if(present(a22i)) then
            a22i(:,:,ie) = a22i_l
         a12a22i(:,:,ie) = matmul( aak(1:ls%pus,ls%pus+1:ls%pu),a22i_l )
          b2a22i(:,:,ie) = matmul( bbk( :      ,ls%pus+1:ls%pu),a22i_l )
       endif
     else
       aak_s  = aak
       fk_s   = fk
       bbk_s  = bbk
       bbkt_s = bbk
       ! nothing to be added to cck
       gk_s = 0.0_wp
     endif
     !------------------------------------------------------------------
  
     !------------------------------------------------------------------
     !3.1) node oriented summation of the element contributions
     ! (concurrent updates of mmm and fff are protected by atomics)
     s_e = (ie-1)*ls%enz ! element shift
 
     ! velocity columns and rhs fk_s (excluded lambda terms)
     do j=1,ls%spus; do jd=1,grid%d
       jj = jd + (j-1)*grid%d
 
       do i=1,ls%spus; do id=1,grid%d
         ii = id + (i-1)*grid%d
         pos = ls%euu(ii,jj) + s_e
         !$omp atomic
          mmm%t2c(pos)%p = mmm%t2c(pos)%p + aak_s(ii,jj)
       enddo; enddo
 
       do i=1,ls%pp
         pos = ls%epu(i,jj) + s_e
         !$omp atomic
          mmm%t2c(pos)%p = mmm%t2c(pos)%p + bbk_s (i ,jj)
         pos = ls%eup(jj,i) + s_e
         !$omp atomic
          mmm%t2c(pos)%p = mmm%t2c(pos)%p + bbkt_s(i ,jj)
       enddo
 
       jjj = (udofs%dofs(j,ie)-1)*grid%d + jd
       !$omp atomic
        fff(jjj) = fff(jjj) + fk_s(jj)
     enddo; enddo
 
     ! pressure columns and rhs gk_s (excluded lambda terms)
     if(ls%stat_cnd.or.eb_p_stab) then
       do j=1,ls%pp
         do i=1,ls%pp
           pos = ls%epp(i,j) + s_e
           !$omp atomic
            mmm%t2c(pos)%p = mmm%t2c(pos)%p + cck(i,j)
         enddo
         jjj = pdofs%dofs(j,ie) + ls%nu
         !$omp atomic
          fff(jjj) = fff(jjj) + gk_s(j)
       enddo
     endif
 
     ! matrix Lambda
     if(div_lag) then
       do j=1,ls%pp
         pos = ls%elp(j) + s_e
         !$omp atomic
          mmm%t2c(pos)%p = mmm%t2c(pos)%p + llk(j)
         pos = ls%epl(j) + s_e
         !$omp atomic
          mmm%t2c(pos)%p = mmm%t2c(pos)%p + llk(j)
       enddo
     endif
     !------------------------------------------------------------------
  
   enddo elem_loop
   !$omp end do nowait
 
   ! The side based advection stabilization is relevant here only in
   ! the implicit advection case
   u_u_stab = u_stab.and.present(uuu)
 
   side_stab_if: if(sb_p_stab.or.u_u_stab) then
     !$omp do schedule(static)
     side_loop: do is=1,grid%ni
 
       s_s = (is-1)*ls%snz + grid%ne*ls%enz ! side shift
 
       press_stab_if: if(sb_p_stab) then
         !--------------------------------------------------------------
         !1) compute local matrix
         call cg_ns_locpstab( tpsk , pbase , grid%s(is) , b_err_x )
 
         ! protect the shared error flag from concurrent writes
         if(b_err_x%lerr) then
           !$omp critical
           b_err = b_err_x
           !$omp end critical
         endif
         !--------------------------------------------------------------
 
         !--------------------------------------------------------------
         !1.5) Rescale the matrix for time dependent problems
         if(present(sigma)) tpsk = sigma*tpsk
         !--------------------------------------------------------------
 
         !--------------------------------------------------------------
         !3.1) node oriented summation of the side contributions (see
         !the comments in cg_ns_locpstab)
         do jk=1,2; do j=1,ls%pp
           jj = j + (jk-1)*ls%pp
           do ik=1,2; do i=1,ls%pp
             ii = i + (ik-1)*ls%pp
             pos = ls%spp(ii,jj) + s_s
             !$omp atomic
              mmm%t2c(pos)%p = mmm%t2c(pos)%p + tpsk(ii,jj)
           enddo; enddo
         enddo; enddo
         !--------------------------------------------------------------
       endif press_stab_if
 
       advec_stab_if: if(u_u_stab) then
         !--------------------------------------------------------------
         !1) compute local matrix
         ie = grid%s(is)%ie(1) ! get the velocity from the first el.
         i  = grid%s(is)%isl(1)
         do id=1,grid%d
           uk2(id,:) = uuu( (udofs%dofs(:,ie)-1)*grid%d + id )
         enddo
         us = matmul( uk2 , ubase%pb(:,                                &
                    ubase%stab(grid%e(ie)%pi(i),:), grid%s(is)%isl(1)) )
         call cg_ns_locustab(tusk,ubase,grid%s(is),us,b_err_x)
 
         ! protect the shared error flag from concurrent writes
         if(b_err_x%lerr) then
           !$omp critical
           b_err = b_err_x
           !$omp end critical
         endif
         !--------------------------------------------------------------
 
         !--------------------------------------------------------------
         !1.5) Rescale the matrix for time dependent problems
         if(present(sigma)) tusk = sigma*tusk
         !--------------------------------------------------------------
 
         !--------------------------------------------------------------
         !3.1) node oriented summation of the side contributions (see
         !the comments in cg_ns_locpstab)
         do jk=1,2; do j=1,ls%spus ! no static cond. in this case
           jj = ((j-1) + (jk-1)*ls%spus)*grid%d
           do ik=1,2; do i=1,ls%spus
             ii = ((i-1) + (ik-1)*ls%spus)*grid%d
 
             do id=1,grid%d
               pos = ls%suu(ii+id,jj+id) + s_s
               !$omp atomic
                mmm%t2c(pos)%p = mmm%t2c(pos)%p +                      &
                                 tusk(i+(ik-1)*ls%spus,j+(jk-1)*ls%spus)
             enddo
           enddo; enddo
         enddo; enddo
         !--------------------------------------------------------------
       endif advec_stab_if
 
     enddo side_loop
     !$omp end do nowait
 
     !$omp do schedule(static)
     ddc_side_loop: do is=1,ddc_grid%nns
 
       s_s = (is-1)*ls%snz + grid%ni*ls%snz + grid%ne*ls%enz
 
       ddc_press_stab_if: if(sb_p_stab) then
         !--------------------------------------------------------------
         !1) compute local matrix (s_copy sees the ghost element)
         call cg_ns_locpstab( tpsk , pbase ,                           &
                      bcs%b_s2bs(ddc_grid%ns(is)%i)%p%s_copy , b_err_x )
 
         ! protect the shared error flag from concurrent writes
         if(b_err_x%lerr) then
           !$omp critical
           b_err = b_err_x
           !$omp end critical
         endif
         !--------------------------------------------------------------
 
         !--------------------------------------------------------------
         !1.5) Rescale the matrix for time dependent problems
         if(present(sigma)) tpsk = sigma*tpsk
         !--------------------------------------------------------------
 
         !--------------------------------------------------------------
         !3.1) node oriented summation of the side contributions: the
         !coefficient 0.5 is due to the fact that the side
         !stabilization is computed by both subdomains
         tpsk = 0.5_wp * tpsk
         do jk=1,2; do j=1,ls%pp
           jj = j + (jk-1)*ls%pp
           do ik=1,2; do i=1,ls%pp
             ii = i + (ik-1)*ls%pp
             pos = ls%spp(ii,jj) + s_s
             !$omp atomic
              mmm%t2c(pos)%p = mmm%t2c(pos)%p + tpsk(ii,jj)
           enddo; enddo
         enddo; enddo
         !--------------------------------------------------------------
       endif ddc_press_stab_if
 
       ddc_advec_stab_if: if(u_u_stab) then
         !--------------------------------------------------------------
         !1) compute local matrix
         ie = grid%s(ddc_grid%ns(is)%i)%ie(1) ! velocity from first el.
         i  = grid%s(ddc_grid%ns(is)%i)%isl(1)
         do id=1,grid%d
           uk2(id,:) = uuu( (udofs%dofs(:,ie)-1)*grid%d + id )
         enddo
         us = matmul( uk2 , ubase%pb(:,                                &
                                    ubase%stab(grid%e(ie)%pi(i),:), i) )
         call cg_ns_locustab( tusk , ubase ,                           &
                 bcs%b_s2bs(ddc_grid%ns(is)%i)%p%s_copy , us , b_err_x )
 
         ! protect the shared error flag from concurrent writes
         if(b_err_x%lerr) then
           !$omp critical
           b_err = b_err_x
           !$omp end critical
         endif
         !--------------------------------------------------------------
 
         !--------------------------------------------------------------
         !1.5) Rescale the matrix for time dependent problems
         if(present(sigma)) tusk = sigma*tusk
         !--------------------------------------------------------------
 
         !--------------------------------------------------------------
         !3.1) node oriented summation of the side contributions
         tusk = 0.5_wp * tusk
         do jk=1,2; do j=1,ls%spus ! no static cond. in this case
           jj = ((j-1) + (jk-1)*ls%spus)*grid%d
           do ik=1,2; do i=1,ls%spus
             ii = ((i-1) + (ik-1)*ls%spus)*grid%d
 
             do id=1,grid%d
               pos = ls%suu(ii+id,jj+id) + s_s
               !$omp atomic
                mmm%t2c(pos)%p = mmm%t2c(pos)%p +                      &
                                 tusk(i+(ik-1)*ls%spus,j+(jk-1)*ls%spus)
             enddo
           enddo; enddo
         enddo; enddo
         !--------------------------------------------------------------
       endif ddc_advec_stab_if
 
     enddo ddc_side_loop
     !$omp end do nowait
   endif side_stab_if
 
   deallocate( aak , fk , bbk , cck )
   deallocate( aak_s , bbkt_s , fk_s , bbk_s , gk_s , llk , &
               tpsk , tusk )
   deallocate( a22i_l , a22ita12t )
   deallocate( bk , uk , uk2 , us , mmk )
   !$omp end parallel
   !$ t1_omp = omp_get_wtime()
   !$ write(message,elapsed_format) &
   !$   'Completed local matrices computation: OpenMP elapsed time ',&
   !$   t1_omp-t0_omp,'s.'
   !$ call info(this_sub_name,this_mod_name,message)
   !$ if(detailed_timing_omp) then
   !$   call omput_write_time()
   !$   call omput_close_timer()
   !$   call omput_pop_key()
   !$ endif
 
 end subroutine ns_mat
 
!-----------------------------------------------------------------------
 
 !> Collect the Dirichlet data.
 pure subroutine ns_dirdata(uuu2,udofs,coeff_dir,ddc_udofs,u_stab)
  type(t_cgdofs), intent(in) :: udofs
  real(wp), intent(out) :: uuu2(:)
  procedure(i_coeff_dir) :: coeff_dir
  type(t_ddc_cgdofs), intent(in) :: ddc_udofs
  logical, intent(in) :: u_stab
 
  integer :: ib, ig, d1, d2, gi, gj
  real(wp) :: udir(udofs%d,size(udofs%dir_dofs,2))
  character(len=*), parameter :: &
    this_sub_name = 'ns_dirdata'
 
    ! Evaluate the Dirichlet datum, one boundary region at a time
    region_loop: do ib=1,size(udofs%idir_dofs,2)
      d1 = udofs%idir_dofs(1,ib)
      d2 = udofs%idir_dofs(2,ib)
      ! regions without Dirichlet dofs are skipped
      if(d2.lt.d1) cycle region_loop
      udir(:,d1:d2) = coeff_dir(            &
        udofs%x(:,udofs%dir_dofs(1,d1:d2)), &
                  udofs%dir_dofs(3,d1)      )
    enddo region_loop
    uuu2(1:udofs%d*size(udofs%dir_dofs,2)) = &
                                    reshape(udir,(/size(udir)/))
 
    ! ghost dir dofs (relevant only with the side velocity stab.)
    if(u_stab) then
      ghost_loop: do ig=1,ddc_udofs%n_ghost_dofs
        gi = ddc_udofs%added_ghost_dofs(ig,1)
        gj = ddc_udofs%added_ghost_dofs(ig,2)
        if(.not.ddc_udofs%ghost_dofs(gi,gj)%dir) cycle ghost_loop
        d1 = (ddc_udofs%ghost_dofs(gi,gj)%i_dir_nat-1)*udofs%d + 1
        d2 = d1 + udofs%d - 1
        uuu2(d1:d2) = reshape( coeff_dir(                           &
          reshape( ddc_udofs%ghost_dofs(gi,gj)%x ,(/udofs%d,1/)) ,  &
                   ddc_udofs%ghost_dofs(gi,gj)%breg ) , (/udofs%d/) )
      enddo ghost_loop
    endif
  
 end subroutine ns_dirdata
 
!-----------------------------------------------------------------------
 
 !> Initialize \c uuu1 (see \c ns_packsol for details)
 !!
 !! This subroutine initializes \c uuu1 given the complete solution \c
 !! uuu. Usually, this operation is not required, since \c uuu1 is
 !! computed directly solving a linear system. It can be useful,
 !! however, to initialize \c uuu1 before starting an iterative
 !! solver.
 !<
 pure subroutine ns_unpacksol(uuu1,uuu,dofs_nat,ls,grid)
  real(wp), intent(in) :: uuu(:)
  integer, intent(in) :: dofs_nat(:)
  type(t_ls), intent(in) :: ls
  type(t_grid), target, intent(in) :: grid
  real(wp), intent(out) :: uuu1(:)
 
  integer :: nv, nn, shift
 
    nv    = ls%ndofs_natv   ! natural velocity dofs
    nn    = ls%ndofs_nat    ! all the natural dofs
    shift = ls%cpus*grid%ne ! statically condensed dofs
 
    ! velocity part: dofs_nat is 0-based, hence the +1
    uuu1(1:nv) = uuu( dofs_nat(1:nv) + 1 )
    ! remaining natural dofs: skip the condensed block in uuu
    uuu1(nv+1:nn) = uuu( shift + dofs_nat(nv+1:nn) + 1 )
  
 end subroutine ns_unpacksol
 
!-----------------------------------------------------------------------
 
 !> Reconstruction of the complete solution vector.
 !!
 !! The solution vector differs from the solution of the linear system
 !! for the following reasons:
 !! <ul>
 !!  <li> some degrees of freedom can be statically eliminated in the
 !!  linear system
 !!  <li> Dirichlet boundary conditions.
 !! </ul>
 !!
 !! This subroutine takes all these factors into account and returns a
 !! complete vector, whose size matches the sizes of the velocity and
 !! pressure finite element spaces.
 !<
 pure subroutine ns_packsol(uuu,ubase,pbase,grid,udofs,pdofs, &
         dofs_nat,dofs_dir,ls,uuu1,uuu2,a22if2,a22ia21,a22ib2t)
  type(t_base), intent(in) :: ubase, pbase !< velocity and pressure bases
  type(t_grid), target, intent(in) :: grid !< grid
  type(t_cgdofs), intent(in) :: udofs, pdofs !< velocity and pressure dofs
  integer, intent(in) :: dofs_nat(:), dofs_dir(:) !< 0-based dof indexes
  type(t_ls), intent(in) :: ls !< system structure
  real(wp), intent(in) :: uuu1(:), uuu2(:), & !< natural/Dirichlet values
    a22if2(:,:), a22ia21(:,:,:), a22ib2t(:,:,:) !< static cond. arrays
  real(wp), intent(out) :: uuu(:) !< complete solution vector
  
  ! js/je delimit the d components of one velocity node in u_s/idx2
  integer :: nsc, idx(udofs%d), ie, i, js, je, idx2(ls%cpus)
  real(wp) :: u_s(ls%pus), p_s(pbase%pk)
 
   ! total number of statically condensed (velocity) dofs
   nsc = ls%cpus*grid%ne
 
   !---------------------------------------------------------------------
   !1) Dirichlet boundary conditions
   ! dofs_nat and dofs_dir are 0-based (see ns_partmat), hence the +1;
   ! the non-velocity natural dofs are shifted past the condensed block
   uuu(dofs_nat(1:ls%ndofs_natv)+1) = uuu1(1:ls%ndofs_natv)
   uuu(nsc+dofs_nat(ls%ndofs_natv+1:ls%ndofs_nat)+1) = &
                                   uuu1(ls%ndofs_natv+1:ls%ndofs_nat)
   uuu(dofs_dir+1) = uuu2
   !---------------------------------------------------------------------
   
   !---------------------------------------------------------------------
   !2) Statically condensed dofs
   if(nsc.gt.0) then
     idx = (/ (i, i=1,udofs%d) /) ! offsets of the d velocity components
     do ie=1,grid%ne
       ! local velocity (the ls%spus non-condensed nodes, interleaved)
       do i=1,ls%spus
         js = (i-1)*udofs%d + 1
         je =    i *udofs%d
         u_s(js:je) = uuu( (udofs%dofs(i,ie)-1)*udofs%d + idx )
       enddo
       ! local pressure
       p_s = uuu(ls%nu+nsc+pdofs%dofs(:,ie))
       ! global indices of the reconstructed dofs
       ! NOTE(review): the loop fills (ubase%pk-ls%spus)*udofs%d
       ! entries of idx2(ls%cpus), so these sizes presumably coincide
       ! -- confirm against the definition of ls%cpus
       do i=1,ubase%pk-ls%spus
         js = (i-1)*udofs%d + 1
         je =    i *udofs%d
         idx2(js:je) = (udofs%dofs(ls%spus+i,ie)-1)*udofs%d + idx  
       enddo
       ! recover the condensed dofs: u2 = A22^-1(f2) - A22^-1 A21 u1
       !                                  - A22^-1 B2^T p
       uuu(idx2) = a22if2(:,ie) &
         -matmul(a22ia21(:,:,ie),u_s)-matmul(a22ib2t(:,:,ie),p_s)
     enddo
   endif
   !---------------------------------------------------------------------
 
 end subroutine ns_packsol
 
!-----------------------------------------------------------------------
 
 !> Strip the ghost degrees of freedom
 pure subroutine trim_sol(uut,uuu,grid,udofs,pdofs,ls)
  real(wp),              intent(in) :: uuu(:)
  type(t_grid),          intent(in) :: grid
  type(t_cgdofs),        intent(in) :: udofs, pdofs
  type(t_ls),            intent(in) :: ls
  real(wp), allocatable, intent(out) :: uut(:)
 
  integer :: nuv, pshift
 
    ! trimmed layout: [ velocity | pressure | (lambda) ]
    nuv = udofs%d*udofs%ndofs
    allocate(uut( nuv + pdofs%ndofs + ls%ndofs_natl ))
 
    ! velocity dofs
    uut(1:nuv) = uuu(1:nuv)
 
    ! pressure dofs: skip Dirichlet, natural velocity and condensed dofs
    pshift = ls%ndofs_dir + ls%ndofs_natv + ls%cpus*grid%ne
    uut(nuv+1:nuv+pdofs%ndofs) = uuu(pshift+1:pshift+pdofs%ndofs)
 
    ! lambda
    if(ls%ndofs_natl.eq.1) uut(size(uut)) = uuu(size(uuu))
 
 end subroutine trim_sol

!-----------------------------------------------------------------------

 !> Reintroduce the ghost degrees of freedom by setting them to zero
 !!
 !! This can be useful when initializing a complete solution vector
 !! with data which do not include the ghost dofs. Notice that \c uuu
 !! is not reallocated, in contrast to what is done in \c trim_sol.
 !! This is because the dimension of uuu is handled in detail in
 !! other subroutines of this module.
 !!
 !! \note Ghost dofs are set to zero. Usually, this implies only a
 !! small inaccuracy for the first time step.
 pure subroutine untrim_sol(uuu,uut,grid,udofs,pdofs,ls)
  real(wp),       intent(in) :: uut(:)
  type(t_grid),   intent(in) :: grid
  type(t_cgdofs), intent(in) :: udofs, pdofs
  type(t_ls),     intent(in) :: ls
  real(wp),       intent(out) :: uuu(:)
 
  integer :: nuv, pshift
 
    nuv    = udofs%d*udofs%ndofs ! velocity dofs in uut
    pshift = ls%ndofs_dir + ls%ndofs_natv + ls%cpus*grid%ne
 
    ! ghost and condensed dofs get the default (zero) value
    uuu = 0.0_wp
 
    ! velocity dofs
    uuu(1:nuv) = uut(1:nuv)
 
    ! pressure dofs
    uuu(pshift+1:pshift+pdofs%ndofs) = uut(nuv+1:nuv+pdofs%ndofs)
 
    ! lambda
    if(ls%ndofs_natl.eq.1) uuu(size(uuu)) = uut(size(uut))
 
 end subroutine untrim_sol

!-----------------------------------------------------------------------
 
 !> Compute the Navier-Stokes right hand side.
 !!
 !! This function should be used only for time dependent problems with
 !! explicit treatment of the advection term. For steady state
 !! problems and fully implicit problems the right hand side is
 !! computed in \c ns_mat.
 !!
 !! The right hand side computed here is
 !! \f{displaymath}{
 !!  \underline{f}_{\sigma} = \underline{b} +
 !!    \sigma\left(\underline{f} -
 !!    \underline{u}\cdot\nabla\underline{u}\right).
 !! \f}
 !! The computed right hand side takes into account the static
 !! condensation, so that it can be used directly in combination with
 !! the matrix returned by \c ns_mat.
 !!
 !! The coefficients \f$\sigma\f$ and \f$\underline{b}\f$ are two
 !! generic coefficients according to the framework specified in \c
 !! mod_time_integrators. For example, using the explicit Euler
 !! method, the right hand side becomes
 !! \f{displaymath}{
 !!  \underline{f}_{\Delta t} = \underline{u}^n +
 !!    \Delta t\left(\underline{f} -
 !!    \underline{u}^n\cdot\nabla\underline{u}^n\right).
 !! \f}
 !<
 pure subroutine ns_rhs(fff,a22if2,ls,ubase,grid,udofs,pdofs, &
                        bbb,uuu,sigma,a22i,a12a22i,b2a22i)
  type(t_ls),       intent(in) :: ls
  type(t_base),     intent(in) :: ubase
  type(t_grid), target, intent(in) :: grid
  type(t_cgdofs),   intent(in) :: udofs, pdofs
  real(wp),         intent(in) :: bbb(:), uuu(:)
  real(wp),         intent(in) :: sigma
  real(wp),         intent(in) :: a22i(:,:,:), a12a22i(:,:,:), &
                                  b2a22i(:,:,:)
  real(wp),              intent(out) :: fff(:) !< rhs
  real(wp), allocatable, intent(out) :: a22if2(:,:)
   
  integer :: ie, id, i, iii
  real(wp), allocatable :: floc(:), bloc(:), uloc(:), fcnd(:), gcnd(:)
  character(len=*), parameter :: &
    this_sub_name = 'ns_rhs'
 
   !--------------------------------------------------------------------
   ! work space
   allocate( floc(ls%pu) , uloc(ls%pu) , bloc(ls%pu) )
   allocate( fcnd(ls%pus) , gcnd(ls%pp) )
   allocate( a22if2(ls%cpus,grid%ne) )
   !--------------------------------------------------------------------
 
   fff = 0.0_wp
   element_loop: do ie=1,grid%ne
 
     !------------------------------------------------------------------
     !1) element rhs, with the velocity components interleaved
     do id=1,grid%d
       bloc(id:ls%pu:grid%d) = bbb( (udofs%dofs(:,ie)-1)*grid%d + id )
       uloc(id:ls%pu:grid%d) = uuu( (udofs%dofs(:,ie)-1)*grid%d + id )
     enddo
     call cg_ns_locrhs( floc , ubase , grid%e(ie) , sigma , bloc , uloc )
     !------------------------------------------------------------------
 
     !------------------------------------------------------------------
     !2) static condensation
     if(ls%stat_cnd) then
       a22if2(:,ie) = matmul( a22i(:,:,ie) , floc(ls%pus+1:ls%pu) )
       fcnd = floc(1:ls%pus) &
              - matmul( a12a22i(:,:,ie) , floc(ls%pus+1:ls%pu) )
       gcnd = - matmul(  b2a22i(:,:,ie) , floc(ls%pus+1:ls%pu) )
     else
       ! allocatable assignment; presumably ls%pus equals ls%pu here
       fcnd = floc
       ! gcnd intentionally left unspecified
     endif
     !------------------------------------------------------------------
  
     !------------------------------------------------------------------
     !3) node oriented summation of the element contributions
     do i=1,ls%spus
       do id=1,grid%d
         iii = (udofs%dofs(i,ie)-1)*grid%d + id
         fff(iii) = fff(iii) + fcnd( (i-1)*grid%d + id )
       enddo
     enddo
 
     if(ls%stat_cnd) then
       do i=1,ls%pp
         iii = pdofs%dofs(i,ie) + ls%nu
         fff(iii) = fff(iii) + gcnd(i)
       enddo
     endif
     !------------------------------------------------------------------
 
   enddo element_loop
 
   deallocate( floc , bloc , uloc , fcnd , gcnd )
  
 end subroutine ns_rhs
 
!-----------------------------------------------------------------------
 
end module mod_cg_ns_linsystem

